/* File 1 of 3 to create CCK results. */



#delimit ;
* From here on every statement, comments included, ends with a semicolon;
set more off;
clear all;

* Prefix shared by the exported graphs, spreadsheets and figure data set;
local outfile "GWgapr10_CCKnorm";

* Write the run date and time to the log;
display _newline "$S_DATE $S_TIME";







********************************************************************************;
* Build the firm-level data set from the observations with Q_rest equal to 1;

use if Q_rest==1 using GWgap_pr_firm_v4, clear;


* Value added per worker: exp(lngo) less exp(lnM), per unit of L_hc_t, then divided by 10000;
* NOTE(review): lngo and lnM are presumably log gross output and log materials - confirm;

generate val_ad_pw = (exp(lngo) - exp(lnM))/(L_hc_t);
replace val_ad_pw = val_ad_pw/10000;
label variable val_ad_pw "Value added per worker (real $0,000s)";

* The log is taken after undoing the division by 10000;
generate lnval_ad_pw = ln(10000*val_ad_pw);
label variable lnval_ad_pw "Value added per worker (ln)";

summarize lnval_ad_pw, detail;


* Flag firms in the tails of ln value added per worker (below 8.7, or 12.5 and above);

generate extreme = lnval_ad_pw<8.7 | lnval_ad_pw>=12.5;
label variable extreme "Firm has ln(value added per worker) below 8.7 or >=12.5";


* Percentiles of ln value added per worker, weighted by L_hc_t;
* L_hc_t is rounded first because frequency weights must be integers;

generate L_hc_rdd = round(L_hc_t);

xtile pc_val_ad_pw = lnval_ad_pw [fweight = L_hc_rdd], nquantiles(100);
label variable pc_val_ad_pw "Percentile of value added per worker distribution";

* Keep only the observations that were assigned a percentile, then store the file
  that the later sections reload;
drop if missing(pc_val_ad_pw);
compress;
save temp_befcol, replace;




* Step 1: list the pent-year pairs with ln value added per worker below 9.95;

use temp_befcol, clear;

keep if lnval_ad_pw<9.95;
keep pent year;
notes : temp_norent.dta created by GWgapr10_CCKnorm.do.;
save temp_norent, replace;

* Step 2: flag the observations of GWgap_pr_IDI_v4 whose pent-year pair is on that list;
* NOTE(review): GWgap_pr_IDI_v4 is presumably the worker-level file - confirm;

#delimit ;
use if Q_rest==1 using GWgap_pr_IDI_v4;

rename hp_pent pent;
merge m:1 pent year using temp_norent, keep(master match);

* norent is 1 for matched observations and 0 for master-only observations;
generate norent = _merge==3;
drop _merge;

* Share of flagged observations, overall and by value of female;
summarize norent;
bysort female: summarize norent;



********************************************************************************;
* Figure 2: mean male and mean female firm FE within each percentile of value added per worker,
  plotted against the mean ln value added per worker of that percentile;

use temp_befcol, clear;


* One row per percentile, with means weighted by the rounded L_hc_t;
drop if missing(pc_val_ad_pw);
collapse (mean) ffe_m ffe_f lnval_ad_pw [fweight = L_hc_rdd], by(pc_val_ad_pw);

twoway
	(scatter ffe_m lnval_ad_pw, mcolor(black))
	(scatter ffe_f lnval_ad_pw, mcolor(gs8) msymbol(D)),
	xtitle("Mean value added per worker (ln)")
	ytitle("Mean fixed effect")
	xline(10, lcolor(gs5) lpattern(dash))
	legend(label(1 "Males") label(2 "Females"))
	graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white));
graph export "`outfile'_feva.pdf", replace;
*graph export "`outfile'_feva.emf", replace;

* Export the plotted points so that the figure 2 graph can be rebuilt elsewhere;
* The variable labels set here become the header row of the sheet;

label variable lnval_ad_pw "Weighted average log of value added per worker";
label variable ffe_m "Weighted average firm fixed effect for men";
label variable ffe_f "Weighted average firm fixed effect for women";
order lnval_ad_pw ffe_m ffe_f;

export excel using "`outfile'_fig2.xlsx", sheet("data_incextr") sheetreplace firstrow(varlabels);



********************************************************************************;
* SUR regs of FE on value added minus T to estimate optimal T, dropping extreme values;
* For each candidate T the male and female firm FE are regressed jointly on
  X = max(lnval_ad_pw - T, 0), and the T with the highest system R-squared is kept;

use temp_befcol, clear;


* range of T to test;
* T_first is tried before the grid, then T_lower, T_lower + T_step, and so on
  for as long as T stays below T_upper;

local T_first 9.95;
local T_lower 8.7;
local T_step 0.1;
local T_upper 12.5;

local Rsq_best 0;
local T_best 0;

* SUR collects one row per candidate T underneath the all-missing row created here;
matrix define SUR = J(1,5,.);
matrix colnames SUR = T Rsquared beta_m beta_f beta_m_f;

* Temporary scalar that holds the system R-squared of the current candidate;
tempname Rsq_sys;

local T `T_first';
local on_grid 0;

while `T' < `T_upper' {;

	di "T = `T'";
	capture matrix drop SURad;
	matrix define SURad = J(1,5,.);
	matrix SURad[1,1] = `T';

	* Regressor is zero up to T and rises one for one with lnval_ad_pw above T;
	capture drop X;
	gen X = max(lnval_ad_pw - `T',0);

	sureg (ffe_m X) (ffe_f X) [fweight = L_hc_rdd] if extreme==0;

	* R-squared for system is average of R-squareds for eqns;
	scalar `Rsq_sys' = (e(r2_1) + e(r2_2))/2;

	matrix SURad[1,2] = `Rsq_sys';
	matrix SURad[1,3] = _b[ffe_m:X];
	matrix SURad[1,4] = _b[ffe_f:X];
	matrix SURad[1,5] = _b[ffe_m:X]/_b[ffe_f:X];

	matrix SUR = SUR \ SURad;

	* Strict inequality, so on a tie the candidate that was tried earlier is kept;
	if `Rsq_sys' > `Rsq_best' {;
		local T_best = `T';
		local Rsq_best = `Rsq_sys';
	};

	* Next candidate. After T_first the grid starts at T_lower itself, so the
	  lower bound is tested as well, and only later passes add T_step;
	if `on_grid' {;
		local T = `T' + `T_step';
	};
	else {;
		local T `T_lower';
		local on_grid 1;
	};
};

* SUR is exported to the spreadsheet further down;
matrix list SUR;
di "Best R2 = `Rsq_best'";
di "Best T = `T_best'";


*** normalising FEs;
* Each FE is re-centred on its size-weighted mean among firms below the best T;

di "Average male and female firm FE, weighted by firm size, for firms below ln(value added per worker)<`T_best'";
di "Save these to a file";

foreach g in m f {;

	sum ffe_`g' if lnval_ad_pw< `T_best' [fweight = L_hc_rdd];
	gen ffe_`g'nva = ffe_`g' - r(mean);
};

* One row per percentile of value added per worker, with weighted means of the normalised FEs;
collapse (mean) ffe_mnva ffe_fnva lnval_ad_pw [fweight = L_hc_rdd], by(pc_val_ad_pw);

* The matrix is written with its all-missing first row included;
putexcel set "`outfile'_SUR", modify;
putexcel A1 = matrix(SUR), colnames;
compress;
save `outfile'_for_fig2, replace;





******************;
* graph scatterplot of ffe_fnva ffe_mnva here with 45 degree line;

* Note best t for full sample is 9.95;

* Reload the percentile-level file saved just above;
use `outfile'_for_fig2, clear;

* Two points, (-0.01, -0.01) and (0.32, 0.32), joined by a line to draw the 45 degree line;
gen x = -0.01 if _n==1;
replace x = 0.32 if _n==2;
gen y = -0.01 if _n==1;
replace y = 0.32 if _n==2;


graph twoway (scatter ffe_fnva ffe_mnva)
	(scatter y x, c(l) mstyle(none) lc(gs8)),
	ytitle("Mean female fixed effects") xtitle("Mean male fixed effects")
	graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white))
	legend(off);

graph export "`outfile'_mf.pdf", replace;
*graph export "`outfile'_mf.emf", replace;

